{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.probability import FreqDist\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "tokenizer = RegexpTokenizer(r'\\w+')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 205147 entries, 0 to 205146\n",
      "Data columns (total 3 columns):\n",
      "title           205146 non-null object\n",
      "total_shares    205147 non-null int64\n",
      "url             205147 non-null object\n",
      "dtypes: int64(1), object(2)\n",
      "memory usage: 4.7+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "data = pd.read_csv(\"alltopcontent.csv\")\n",
    "data = data[[\"title\",\"total_shares\", \"url\"]]\n",
    "print(data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "allwords = []\n",
    "for i in range(0,205146):\n",
    "    title = data['title'][i]\n",
    "    title = str(title)\n",
    "    title = title.lower()\n",
    "    titlewords = tokenizer.tokenize(title)\n",
    "    for i in titlewords:\n",
    "        allwords.append(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "allwordsstring = \"\"\n",
    "for word in allwords:\n",
    "    #print(word)\n",
    "    allwordsstring = allwordsstring + \" \" + str(word)\n",
    "text_file = open(\"alltitlewords.txt\", \"w\")\n",
    "text_file.write(allwordsstring)\n",
    "text_file.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "freqdist = FreqDist(allwords)\n",
    "freqlist = freqdist.most_common()\n",
    "freqliststring = \"\"\n",
    "for i in freqlist:\n",
    "    i = str(i)\n",
    "    freqliststring = freqliststring + \" \" + i\n",
    "    #print(i)\n",
    "text_file = open(\"wordfrequencylist.txt\", \"w\")\n",
    "text_file.write(freqliststring)\n",
    "text_file.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos = nltk.pos_tag(allwords)\n",
    "nouns = []\n",
    "for p in pos:\n",
    "    if p[1] == 'NN':\n",
    "        nouns.append(p[0])\n",
    "nounfreq = FreqDist(nouns)\n",
    "nounfreqlist = nounfreq.most_common()\n",
    "nounfreqliststring = \"\"\n",
    "for i in nounfreqlist:\n",
    "    i = str(i)\n",
    "    nounfreqliststring = nounfreqliststring + \" \" + i\n",
    "    #print(i)\n",
    "text_file = open(\"nounfrequencylist.txt\", \"w\")\n",
    "text_file.write(nounfreqliststring)\n",
    "text_file.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos = nltk.pos_tag(allwords)\n",
    "adjs = []\n",
    "for p in pos:\n",
    "    if p[1] == 'JJ':\n",
    "        adjs.append(p[0])\n",
    "adjfreq = FreqDist(adjs)\n",
    "adjfreqlist = adjfreq.most_common()\n",
    "adjfreqliststring = \"\"\n",
    "for i in adjfreqlist:\n",
    "    i = str(i)\n",
    "    adjfreqliststring = adjfreqliststring + \" \" + i\n",
    "    #print(i)\n",
    "text_file = open(\"adjfrequencylist.txt\", \"w\")\n",
    "text_file.write(adjfreqliststring)\n",
    "text_file.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
